home *** CD-ROM | disk | FTP | other *** search
- {$A+,B-,D-,E-,F-,G-,I-,L-,N-,O-,P-,Q-,R-,S+,T-,V+,X+}
- unit Index;
- {$DEFINE ANALYSE}
- {.$DEFINE BLOCK}
- interface
- const
- IdentSet = ['A'..'Z','a'..'z','0'..'9','-','+'];
- StartSet = ['A'..'Z','a'..'z'];
-
- const
- MaxPage = 255;
- MaxHits = 100; { out of 214 webpages }
-
- type
- TNumPage = 0..MaxPage; { max number of webpages in site }
- TURLPage = ShortString { assuming URL <= 255 characters };
-
- var
- WebPages: TNumPage = 0;
- WebPage: Array[TNumPage] of TURLPage;
- Titles: Array[TNumPage] of TURLPage;
-
- const
- MaxKeyword = 31-8;
-
- type
- TKeyword = String[MaxKeyword];
- TPageSet = Set of TNumPage;
-
- type
- TNode = record
- Keyword: TKeyword; { 32 bytes }
- URLs: TPageSet; { 32 bytes }
- end {TNode};
-
- TTree = class
- Node: TNode;
- constructor Create(const Keyword: TKeyword; WebPage: TNumPage);
- destructor Destroy; override;
- private
- Prev,Next: TTree;
- end {TTree};
-
- var
- Keywords: Integer = 0;
- root: TTree = nil;
-
- type
- {$IFDEF BLOCK}
- TIndexFile = File;
- {$ELSE}
- TIndexFile = File of TNode;
- {$ENDIF}
-
- implementation
- uses
- DrBobSys, SysUtils;
-
- constructor TTree.Create(const Keyword: TKeyword; WebPage: TNumPage);
- begin
- inherited Create;
- Inc(Keywords); // keep track of number of keywords
- Prev := nil;
- Next := nil;
- {$IFNDEF BLOCK}
- FillChar(Node.Keyword,MaxKeyword+1,#0); { sparse }
- {$ENDIF}
- Node.Keyword := Keyword;
- Node.URLs := [WebPage]
- end {Create};
-
- destructor TTree.Destroy;
- begin
- if Prev <> nil then Prev.Free;
- if Next <> nil then Next.Free;
- inherited Destroy
- end {Destroy};
-
- procedure AddKeyword(const Keyword: TKeyword; WebPage: TNumPage);
- var
- tmp: TTree;
- begin
- if root = nil then
- root := TTree.Create(Keyword,WebPage)
- else { search }
- begin
- tmp := root;
- repeat
- if tmp.Node.Keyword > Keyword then
- begin
- if tmp.Prev = nil then
- tmp.Prev := TTree.Create(Keyword,WebPage);
- tmp := tmp.Prev
- end
- else
- if tmp.Node.Keyword < Keyword then
- begin
- if tmp.Next = nil then
- tmp.Next := TTree.Create(Keyword,WebPage);
- tmp := tmp.Next
- end
- until tmp.Node.Keyword = Keyword;
- tmp.Node.URLs := tmp.Node.URLs + [WebPage]
- end
- end {AddKeyword};
-
- procedure ScanPage(const FileName: ShortString; WebPage: TNumPage);
- var
- f: Text;
- NotInTag: Boolean;
- Keyword: ShortString;
- Len: Byte absolute Keyword;
- begin
- assign(f,FileName);
- reset(f);
- if IOResult = 0 then
- begin
- writeln('<LI><B>',FileName,'</B>');
- Len := 0;
- while (Len = 0) and not eof(f) do
- begin
- readln(f,Keyword);
- if Pos('<TITLE>',UpperCase(Keyword)) > 0 then
- begin
- Delete(Keyword,1,Pos('<TITLE>',UpperCase(Keyword))+6);
- Delete(Keyword,Pos('</TITLE>',UpperCase(Keyword)),255)
- end
- else
- Len := 0
- end;
- if Len = 0 then writeln('- <I>has no title or description...</I>')
- else
- Titles[WebPages] := Keyword; { Title of Webpage }
- NotInTag := True;
- close(f);
- assign(f,FileName);
- reset(f); { second time }
- while not eof(f) do
- begin
- Len := 0;
- while not eoln(f) do
- begin
- Inc(Len);
- read(f,Keyword[Len]);
- if not (Keyword[Len] in IdentSet) then
- begin
- Dec(Len);
- if (Len > 2) and NotInTag then
- if (Len <= MaxKeyWord) then
- AddKeyword(LowerCase(Keyword),WebPage)
- else
- writeln('<BR>skipped keyword: ',Keyword);
- if Keyword[Len+1] = '>' then NotInTag := True
- else
- if Keyword[Len+1] = '<' then NotInTag := False;
- Len := 0
- end
- else
- if (Len = 1) then { start with letter ?? }
- if not (Keyword[1] in StartSet) then Len := 0
- end;
- if (Len > 2) and NotInTag then
- if (Len <= MaxKeyWord) then
- AddKeyword(LowerCase(Keyword),WebPage)
- else
- writeln('<BR>skipped keyword: ',Keyword);
- readln(f)
- end;
- close(f)
- end
- else
- writeln('<LI>',FileName); { failed to open }
- end {ScanPage};
-
- procedure ScanPages(const Path: ShortString);
- var
- SRec: TSearchRec;
- begin
- if FindFirst('*.*', faDirectory, SRec) = 0 then
- repeat
- if (SRec.Attr AND faDirectory) = faDirectory then
- begin
- if (SRec.Name[1] <> '.') then { skip '.' and '..' }
- if Pos('_vti',SRec.Name) = 0 then { _vti_cnf etc. }
- begin
- ChDir(SRec.Name);
- if IOResult = 0 then
- begin
- writeln('<LI><I>',SRec.Name,'</I>');
- writeln('<UL>');
- ScanPages(Path+'/'+SRec.Name);
- writeln('</UL>');
- ChDir('..')
- end
- else
- writeln('<LI><I>',SRec.Name,'</I> - locked')
- end
- end
- else { file }
- if ((Pos('.HTM',UpperCase(SRec.Name)) > 0) or
- (Pos('.ASP',UpperCase(SRec.Name)) > 0)) and
- (Pos('.bak',SRec.Name) = 0) then
- begin
- WebPage[WebPages] := Path + '/' + SRec.Name;
- ScanPage(SRec.Name,WebPages);
- Inc(WebPages)
- end
- until FindNext(SRec) <> 0;
- FindClose(SRec)
- end {ScanPages};
-
- function Pages(PageSet: TPageSet): Byte;
- var
- B: Byte;
- begin
- Result := 0;
- for B := 0 to MaxPage do
- if B in PageSet then Result := Result + 1
- end {Pages};
-
- procedure WriteTree(var IndexFile: TIndexFile; root: TTree);
- begin
- if root.Prev <> nil then WriteTree(IndexFile,root.Prev);
- if (Length(root.node.Keyword) > 3) or
- (Pages(root.node.URLs) <= MaxHits) then
- begin
- {$IFDEF BLOCK}
- BlockWrite(IndexFile,root.Node.Keyword[0],Ord(root.node.Keyword[0])+1);
- BlockWrite(IndexFile,root.Node.URLs,SizeOf(root.Node.URLs));
- {$ELSE}
- write(IndexFile,root.Node);
- {$ENDIF}
- Inc(Keywords) { counter }
- end
- else
- writeln('<LI>',root.Node.Keyword);
- if root.Next <> nil then WriteTree(IndexFile,root.Next)
- end {WriteTree};
-
- function WalkTreeLength(len,hits: Integer; root: TTree): Integer;
- { find words of Length len, that have <= hits pages }
- begin
- Result := 0;
- if root.Prev <> nil then
- Result := Result + WalkTreeLength(len,hits,root.Prev);
- if (len = 0) or (Length(root.Node.Keyword) = len) then
- if (Pages(root.node.URLs) <= hits) then
- Result := Result + 1;
- if root.Next <> nil then
- Result := Result + WalkTreeLength(len,hits,root.Next)
- end {WalkTreeLength};
-
- var
- i,j,k,l: Integer;
- Str: ShortString;
- PageFile: Text;
- IndexFile: TIndexFile;
-
- type
- TWhoAmI = (drbob42_com, intranet);
- var
- WhoAmI: TWhoAmI;
- initialization
- StartTime := timeGetTime;
- Str := ParamStr(0);
- if Pos('D:\INTRANET',UpperCase(STR)) = 1 then { intranet }
- begin
- WhoAmI := intranet;
- Str := 'http://www.bolesian.nl/groups/delphi/drbob42';
- ChDir('groups\delphi\drbob42')
- end
- else { real internet }
- begin
- WhoAmI := drbob42_com;
- Str := 'http://www.drbob42.com';
- ChDir('..')
- end;
- writeln('content-type: text/html');
- writeln;
- writeln('<HTML>');
- writeln('<BODY BACKGROUND="/gif/back.gif">');
- writeln('<H2>IndexBob</H2>');
- writeln('Dr.Bob''s Website Parser version 2.01 - 1998/01/06');
- writeln('<P>Creating index for: www.drbob42.com');
- if WhoAmI = intranet then writeln('(intranet)');
- writeln('<P>');
- writeln('<UL>');
- ScanPages(Str);
- writeln('</UL>');
- {$IFDEF ANALYSE}
- writeln('<P>');
- writeln('<TABLE BORDER>');
- write('<TR><TD BGCOLOR=ABC789>Len.</TD>');
- j := 1;
- repeat
- write('<TD BGCOLOR=A7B7C7>',j,'</TD>');
- j := j + j
- until j > MaxHits;
- writeln('<TD BGCOLOR=A7B7C7>',MaxHits,'</TD><TD BGCOLOR=A7B7C7>255</TD><TD BGCOLOR=A7B7C7>Total</TD></TR>');
- for i:=1 to MaxKeyWord do
- begin
- write('<TR><TD BGCOLOR=ABC789>',i,'</TD>');
- j := 1;
- l := 0;
- repeat
- k := l;
- l := WalkTreeLength(i,j,root);
- write('<TD>',l-k,'</TD>');
- j := j + j
- until j > MaxHits;
- k := l;
- l := WalkTreeLength(i,MaxHits,root);
- writeln('<TD>',l-k,'</TD>');
- k := l;
- l := WalkTreeLength(i,255,root);
- writeln('<TD>',l-k,'</TD><TD>',l,'</TD></TR>');
- end;
- write('<TR><TD BGCOLOR=ABC789>Total</TD>');
- j := 1;
- l := 0;
- repeat
- k := l;
- l := WalkTreeLength(0,j,root);
- write('<TD>',l-k,'</TD>');
- j := j + j
- until j > MaxHits;
- k := l;
- l := WalkTreeLength(0,MaxHits,root);
- writeln('<TD>',l-k,'</TD>');
- k := l;
- l := WalkTreeLength(0,255,root);
- writeln('<TD>',l-k,'</TD><TD>',l,'</TD></TR>');
- writeln('</TABLE>');
- writeln('<P>');
- {$ENDIF}
- if WhoAmI = intranet then ChDir('\intranet\cgi_bin')
- else ChDir('cgi-bin');
- if IOResult <> 0 then { skip };
- assign(IndexFile,'index.bob');
- if root <> nil then
- try
- {$IFDEF BLOCK}
- rewrite(IndexFile,1);
- {$ELSE}
- rewrite(IndexFile);
- {$ENDIF}
- Keywords := 0; { real counter }
- writeln('Skipped common keywords:');
- writeln('<OL>');
- WriteTree(IndexFile,root);
- writeln('</OL>')
- finally
- close(IndexFile)
- end;
- assign(PageFile,'pages.bob');
- try
- rewrite(PageFile);
- for i:=0 to WebPages-1 do
- writeln(PageFile,WebPage[i]);
- {$IFDEF BLOCK}
- writeln(PageFile,KeyWords)
- {$ENDIF}
- finally
- close(PageFile)
- end;
- assign(PageFile,'title.bob');
- try
- rewrite(PageFile);
- for i:=0 to WebPages-1 do
- writeln(PageFile,Titles[i]);
- finally
- close(PageFile)
- end;
- writeln('<HR>');
- writeln('<FONT SIZE=1>');
- writeln('Webpages: ',WebPages);
- writeln('<BR>Keywords: ',Keywords);
- writeln('<BR>Index Time: ',(timeGetTime-StartTime)/1000:1:2,' sec.');
- writeln('</FONT>');
- writeln('<HR>');
- writeln('</BODY>');
- writeln('</HTML>')
- finalization
- root.Free
- end.
-